Using a random forest to find the input-space locations of low-carbon and crashed runs in the JULES ensemble.

Useful tutorial for random forests in R:
https://www.r-bloggers.com/2021/04/random-forest-in-r/

# Reproducible random draws (holdout sampling, random forest fitting)
set.seed(42)

library(RColorBrewer)
library(e1071)
library(MASS)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Loads the wave-0 ensemble summary; presumably provides X (input design
# matrix scaled to [0,1]), Y_char (run outcome labels) and lhs -- TODO confirm
load('data/wave0_summary.RData')

Plot the data

# Number of input dimensions (used as numeric panel labels on the pairs plot)
d <- ncol(X)

# Row indices of the two failure classes in the ensemble outcome vector
lowcarbon_ix <- which(Y_char =='LOWCARBON')
crashed_ix <- which(Y_char == 'CRASHED')

x1 <- X[lowcarbon_ix , ]
x2 <- X[crashed_ix ,]

# Stack the two groups: rows 1..nrow(x1) are LOWCARBON, the rest CRASHED
XY <- rbind(x1, x2)

# Pairs plot with the two classes split across the diagonal:
# lower panels plot LOWCARBON (blue), upper panels CRASHED (red).
# Each panel function receives the pooled column of XY, so group
# membership is recovered purely by row position within XY.
pairs(XY,
      lower.panel=function(x, y, ...) {
        # First nrow(x1) entries of each column are the LOWCARBON rows
        Xx <- x[seq_len(nrow(x1))] 
        Xy <- y[seq_len(nrow(x1))] 

        points(Xx, Xy, col = 'blue', pch = 19, cex = 0.8)
      }, 
      upper.panel=function(x, y, ...) {
        # Remaining entries are the CRASHED rows
        Yx <- x[(nrow(x1) + 1):length(x)]
        Yy <- y[(nrow(x1) + 1):length(y)] 
        

        points(Yx, Yy, col = 'red', pch = 19, cex = 0.8)

      }, 
      gap = 0,
      xlim = c(0,1), ylim = c(0,1),
      labels = 1:d,
      oma = c(2, 18, 2, 2)) # move the default tick labels off the plot 


# Reset the graphics device to a single full-window panel with zero
# margins, then open an invisible unit-square plot on top of the current
# figure. This lets legends and annotations be placed anywhere over the
# previous plot using [0,1] coordinates.
reset <- function() {
  par(
    mfrow = c(1, 1),
    oma = rep(0, 4),
    mar = rep(0, 4),
    new = TRUE
  )
  plot(
    0:1, 0:1,
    type = "n",
    xlab = "", ylab = "",
    axes = FALSE
  )
}
reset()

# Map the numeric panel labels (1..d) back to parameter names.
# NOTE(review): 'lhs' is not created anywhere in this script -- presumably
# it comes from the loaded RData and shares column names with X; verify.
legend('left', legend = paste(1:d, colnames(lhs)), cex = 1.1, bty = 'n')
legend('topleft', pch = 19, col = c('red', 'blue'), legend = c('CRASHED', 'LOWCARBON'), bty = 'n', inset = 0.02, cex = 1.1 )

# Combine inputs and outcome into one modelling data frame; the response
# 'y' is a factor (levels CRASHED / LOWCARBON / RAN, per the output below)
data_df <- data.frame(X, y = as.factor(Y_char))

# Simple holdout split: first 399 rows train, last 100 rows test.
# NOTE(review): assumes data_df has 499 rows and that row order carries no
# information (e.g. a randomised design) -- TODO confirm
ix_train <- 1:399
ix_test <- 400:499

train <- data_df[ix_train, ]
test <- data_df[ix_test, ]
# Fit a classification forest; the proximity matrix is kept for MDSplot() later
rf <- randomForest(y~., data=train, proximity=TRUE)
print(rf)
## 
## Call:
##  randomForest(formula = y ~ ., data = train, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 7.52%
## Confusion matrix:
##           CRASHED LOWCARBON RAN class.error
## CRASHED         0         6  12  1.00000000
## LOWCARBON       0        50   9  0.15254237
## RAN             0         3 319  0.00931677

Confusion matrix for the training set

# Predictions on the training data itself -- expect (near-)perfect accuracy,
# so this mainly sanity-checks the model object rather than generalisation
p1 <- predict(rf, train)
confusionMatrix(p1, train$y)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  CRASHED LOWCARBON RAN
##   CRASHED        18         0   0
##   LOWCARBON       0        59   0
##   RAN             0         0 322
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9908, 1)
##     No Information Rate : 0.807      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity                 1.00000           1.0000      1.000
## Specificity                 1.00000           1.0000      1.000
## Pos Pred Value              1.00000           1.0000      1.000
## Neg Pred Value              1.00000           1.0000      1.000
## Prevalence                  0.04511           0.1479      0.807
## Detection Rate              0.04511           0.1479      0.807
## Detection Prevalence        0.04511           0.1479      0.807
## Balanced Accuracy           1.00000           1.0000      1.000

Confusion matrix for the test set

# Out-of-sample performance on the 100-row holdout set. Note the forest
# never predicts CRASHED here (sensitivity 0 for that rare class).
p2 <- predict(rf, test)
confusionMatrix(p2, test$y)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  CRASHED LOWCARBON RAN
##   CRASHED         0         0   0
##   LOWCARBON       4        18   3
##   RAN             2         4  69
## 
## Overall Statistics
##                                          
##                Accuracy : 0.87           
##                  95% CI : (0.788, 0.9289)
##     No Information Rate : 0.72           
##     P-Value [Acc > NIR] : 0.0002808      
##                                          
##                   Kappa : 0.679          
##                                          
##  Mcnemar's Test P-Value : 0.1048630      
## 
## Statistics by Class:
## 
##                      Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity                    0.00           0.8182     0.9583
## Specificity                    1.00           0.9103     0.7857
## Pos Pred Value                  NaN           0.7200     0.9200
## Neg Pred Value                 0.94           0.9467     0.8800
## Prevalence                     0.06           0.2200     0.7200
## Detection Rate                 0.00           0.1800     0.6900
## Detection Prevalence           0.00           0.2500     0.7500
## Balanced Accuracy              0.50           0.8642     0.8720
# Error rate vs number of trees (per class and OOB overall)
plot(rf)

# Tune mtry (number of variables tried at each split) for the forest.
#
# NOTE(review): the original called tuneRF(train[,-5], train[,5]), which
# drops/uses the 5th *predictor* column rather than the response. The
# response factor 'y' is the LAST column of train (built as
# data.frame(X, y = ...)), so select it by name instead. The printed
# "mtry = 10" (~ p/3) below confirms the original run was treated as a
# regression on a predictor, not classification on y.
t <- tuneRF(train[, setdiff(names(train), "y")], train[, "y"],
       stepFactor = 0.5,
       plot = TRUE,
       ntreeTry = 150,
       trace = TRUE,
       improve = 0.05)
## Output from the original (pre-fix) run, retained for reference:
## mtry = 10  OOB error = 0.08717852 
## Searching left ...
## mtry = 20    OOB error = 0.08900195 
## -0.020916 0.05 
## Searching right ...
## mtry = 5     OOB error = 0.08722938 
## -0.0005833487 0.05

# Distribution of terminal-node counts across the trees in the forest
hist(treesize(rf),
     main = "No. of Nodes for the Trees",
     col = "grey")

Variable importance plot clearly identifies the two most important parameters

#Variable Importance
# Mean decrease in Gini impurity for the 10 most important inputs
varImpPlot(rf,
           sort = T,
           n.var = 10,
           main = "Top 10 - Variable Importance")

# Full importance table: f0_io and b_wl_io dominate by a wide margin
importance(rf)
##                  MeanDecreaseGini
## alpha_io                 2.682637
## a_wl_io                  2.763502
## bio_hum_cn               3.341003
## b_wl_io                 21.133500
## dcatch_dlai_io           2.165257
## dqcrit_io                3.323660
## dz0v_dh_io               2.146042
## f0_io                   34.293527
## fd_io                    2.067817
## g_area_io                1.862012
## g_root_io                1.768203
## g_wood_io                2.068497
## gs_nvg_io                1.676781
## hw_sw_io                 3.065932
## kaps_roth                2.699277
## knl_io                   2.627970
## lai_max_io               2.697069
## lai_min_io               2.862380
## lma_io                   1.982686
## n_inorg_turnover         1.523937
## nmass_io                 2.651071
## nr_io                    2.250479
## retran_l_io              2.732983
## retran_r_io              2.342289
## r_grow_io                2.141003
## rootd_ft_io              4.591724
## sigl_io                  2.353085
## sorp                     1.812005
## tleaf_of_io              3.416385
## tlow_io                  2.150968
## tupp_io                  2.263766
## l_vg_soil                2.136931
#MeanDecreaseGini

Partial importance of the two most important parameters

# Partial dependence on each of the two most important inputs
# (the column name is passed unquoted; partialPlot substitutes it)
partialPlot(rf, train, f0_io)

partialPlot(rf, train, b_wl_io)

# 2D multidimensional scaling of the forest proximity matrix, coloured by class
MDSplot(rf, train$y)

Sample from the whole input space and make predictions

# Sample uniformly from the unit hypercube [0,1]^d and predict the class
# everywhere, to visualise the forest's decision regions.
n_samp <- 500

# The original hard-coded runif(16000) (= 500 x 32 inputs), which silently
# breaks if the number of input columns changes; derive it from ncol(X).
samp <- runif(n_samp * ncol(X))

samp_mat <- matrix(data = samp, nrow = n_samp)

colnames(samp_mat) <- colnames(X)

samp_pred <- predict(rf, samp_mat)
# Palette indexed by the predicted factor's level order -- assumed to be
# alphabetical CRASHED / LOWCARBON / RAN as in the training response;
# verify if the levels ever change.
pal <- c('red', 'blue', 'grey')
plot(samp_mat[, 4], samp_mat[, 8], col = pal[samp_pred],
     xlab = colnames(X)[4], ylab = colnames(X)[8])

The pairs plot looks very similar to the test data set at the start of the page

# Pairs plot of only those hypercube samples the forest labels LOWCARBON,
# showing where in input space the low-carbon region sits
pairs(
  samp_mat[samp_pred=='LOWCARBON', ],
  gap = 0,
  col = 'blue',
  upper.panel = NULL,
  pch = 20,
  labels = 1:d
  
)

reset()

# NOTE(review): 'lhs' presumably comes from the loaded RData and shares
# column names with X -- verify.
legend('right', legend = paste(1:d, colnames(lhs)), cex = 1.1, bty = 'n')
legend('topright', pch = 19, col = c('red', 'blue'), legend = c('failed', 'zero carbon cycle'), bty = 'n', inset = 0.02, cex = 1.1 )

## Density of lowcarbon

How does classifier accuracy change with ensemble size?

# This function does a simple bootstrap test of the randomForest algorithm
# for a set number of ensemble members (ntrain). It splits the data into a
# training set of ntrain rows and a test set of ntest rows, trains the model,
# predicts the test set and records the misclassification rate for each rep
# in the output.
#
# Args:
#   data   data frame of predictors plus a factor response column 'y'
#   ntrain number of rows used for training in each rep
#   ntest  number of rows used for testing in each rep
#   nreps  number of bootstrap repetitions
# Returns: numeric vector, length nreps, of misclassification rates
#          (1 - overall accuracy on the held-out rows).
boot_rf <- function(data, ntrain, ntest, nreps){

  outvec <- rep(NA, nreps)

  # Loop-invariant total sample size, hoisted out of the loop
  n_train_and_test <- ntrain + ntest

  for(i in seq_len(nreps)){

    # NOTE(review): the original sampled from 1:nrow(X), silently relying on
    # the global X having the same number of rows as 'data'; sample from the
    # data argument itself instead.
    all_samp_ix <- sample(seq_len(nrow(data)), n_train_and_test)
    train_ix <- all_samp_ix[1:ntrain]
    test_ix <- all_samp_ix[(ntrain + 1):n_train_and_test]

    train <- data[train_ix, ]
    test  <- data[test_ix, ]

    rf <- randomForest(y ~ ., data = train, proximity = TRUE)

    pred_test <- predict(rf, test)
    conf_test <- confusionMatrix(pred_test, test$y)

    # Misclassification rate = 1 - overall accuracy on the held-out set
    outvec[i] <- 1 - conf_test$overall['Accuracy']

  }

  outvec

}
# test the above
# Quick smoke test: 10 reps with a small train/test split
test_boot <- boot_rf(data = data_df, ntrain = 100, ntest = 10, nreps = 10)

How does misclassification rate vary with ensemble size?

# Sweep the training (ensemble) size and record bootstrap
# misclassification rates at each size.
nreps <- 50

nens_vec <- seq(from = 100, to = 400, by = 20)

# One row per bootstrap rep, one column per ensemble size
outmat <- matrix(nrow = nreps, ncol = length(nens_vec))

# seq_along() rather than 1:length() (safe for an empty vector)
for(j in seq_along(nens_vec)){

  boot_rf_out <- boot_rf(data = data_df, ntrain = nens_vec[j], ntest = 50, nreps = nreps)

  outmat[, j] <- boot_rf_out

}
# 1 - accuracy, averaged over reps at each ensemble size
# (colMeans is the direct idiom for apply(outmat, 2, mean))
oma_mean <- colMeans(outmat)

plot(nens_vec, oma_mean, type = 'b')

# Full distribution of misclassification rates per ensemble size
boxplot(outmat)